############################################################ ########### #
#
# Project: Stability and change in the opinion-policy relationship
# 
# This script validates the IPUMS census data against Guess etal's census frequencies
# (intended for internal use)
# 
# 2022.09.30. 
############################################################ ########### #


library(rio)
library(tidyverse)

# copy statecode - statename combinations from codebook
states_abb <- data.frame(
        state_codes = c(01, 02, 04, 05, 06, 08, 09, 10, 11, 12, 13, 15, 16, 17, 18, 
                        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
                        34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49,
                        50, 51, 53, 54, 55, 56), 
        state = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", 
                  "Colorado", "Connecticut", "Delaware", "District of Columbia", 
                  "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", 
                  "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", 
                  "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                  "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
                  "New Jersey", "New Mexico", "New York", "North Carolina", 
                  "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
                  "Rhode Island", "South Carolina", "South Dakota", "Tennessee", 
                  "Texas", "Utah", "Vermont", "Virginia", "Washington", 
                  "West Virginia", "Wisconsin", "Wyoming"))



# Quality check against Guess's '17 census --------------------------------

census17 <- import("A-original-data/census2017.csv") %>% 
        # filter out minors
        filter(AGE >= 18) %>% 
        # create a clean dataframe
        transmute(year = YEAR,
                  # grouping age into 4 groups. 18-29-44-64- 
                  age = as.numeric(cut(AGE, 
                                       breaks = c(18, 29, 44, 64, 
                                                  max(AGE, na.rm = TRUE)), 
                                       labels = 1:4, 
                                       include.lowest = TRUE)),
                  female = SEX - 1, 
                  # grouping education into 5 groups
                  edu = case_when(EDUCD < 61 ~ "nohs", 
                                  EDUCD >= 62 & EDUCD <= 64 ~ "hs", 
                                  EDUCD >= 65 & EDUCD <= 100 ~ "somecol", 
                                  EDUCD <= 101 & EDUCD <= 113 ~ "col", 
                                  EDUCD >= 114 ~ "postgrad", 
                                  TRUE ~ NA_character_),
                  # (not not) hispanic and not black = hisp 3
                  # black (even if also hispanic) = black 2
                  # white and other = 1
                  racecat = case_when(HISPAN != 0 & RACBLK == 1 ~ 2 , 
                                      RACBLK == 2 ~ 1, 
                                      TRUE ~ 0),
                  state_codes = STATEFIP,
                  count = PERWT
        ) %>% 
        group_by(year, age, female, edu, racecat, state_codes) %>% 
        summarise(freq=sum(count)) %>% 
        ungroup() %>% 
        complete(year, age, female, edu, racecat, state_codes, 
                 fill = list(freq = 0)) %>%
        left_join(states_abb) %>%
        mutate(state = tolower(state)) %>% 
        # dplyr::select(-state_abb, -year ) %>%
        glimpse


guess <- import("A-original-data/pstrat_acs17.csv") %>% 
        rename(racecat = "race") %>% 
        mutate(state = tolower(state_name)) %>% 
        select(-state_abb, -state_name) %>% 
        glimpse()

mydf <- left_join(census17, guess, 
                  by = c("age", "female", "edu", "racecat", "state"),
                  suffix = c(".ipums", ".guess")) %>% 
        glimpse

cor.test(mydf$freq.ipums, mydf$freq.guess)

ggplot(mydf, aes(x = freq.guess, y = freq.ipums)) + 
        geom_point(alpha = .1) + 
        geom_smooth() + 
        facet_wrap(~state, scales = "free") +
        theme_minimal()

sum(guess$freq)
sum(census17$freq)

plot(tapply(guess$freq, guess$state, sum), 
     tapply(census17$freq, census17$state, sum))

round(cbind(tapply(guess$freq, guess$racecat, function(x) sum(x/sum(guess$freq))), 
            tapply(census17$freq, census17$racecat, function(x) sum(x/sum(census17$freq)))), 2)

round(cbind(tapply(guess$freq, guess$age, function(x) sum(x/sum(guess$freq))), 
            tapply(census17$freq, census17$age, function(x) sum(x/sum(census17$freq)))), 2)

round(cbind(tapply(guess$freq, guess$female, function(x) sum(x/sum(guess$freq))), 
            tapply(census17$freq, census17$female, function(x) sum(x/sum(census17$freq)))), 2)


guess %>%  filter(state_name == "California" & age == 1) %>% 
        select(-age) 

census17 %>%  filter(state == "california" & age == 1) %>% 
        select(-age) 

mydf %>%  filter(state == "california" & age == 1) %>% 
        select(-age) 
